# 24-nov
# - add DBSCAN to basic clustering ...
# - train and adjust parameters
# NOTE: a `from __future__ import ...` must be the FIRST statement in a module.
# In the original it appeared after all other imports (a SyntaxError when this
# file is executed as a script; it only worked as a separate notebook cell).
from __future__ import print_function

import copy
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams

warnings.simplefilter('ignore', DeprecationWarning)

# %matplotlib inline
# import hdbscan
from sklearn.model_selection import ShuffleSplit, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
# from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import metrics as mt
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import confusion_matrix as conf
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.cluster import KMeans
from tabulate import tabulate

# in a notebook, echo every expression in a cell (not just the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... read in the cleaned data set
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
data_dir = '../data/'
data_file = 'mashable_clean_dataset_for_lab_03.csv'
file_2_read = data_dir + data_file
df = pd.read_csv(file_2_read)
# work on a deep copy so the raw frame stays untouched
df_cluster = copy.deepcopy(df)
df_cluster.head()
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... derive a binary 'popular' target from raw share counts
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# invert the log transform to recover raw shares, then flag articles above
# the 1400-share cutoff (presumably the dataset's share median — confirm)
df_cluster['shares'] = np.exp(df_cluster['ln_shares'])
df_cluster['popular'] = (df_cluster['shares'] > 1400).astype(int)
df_cluster.head()
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... read in the 2-D t-SNE embedding vectors
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
data_dir = '../data/'
data_file = 't_sne_mapping_perplex_0100.csv'
file_2_read = data_dir + data_file
df_tsne = pd.read_csv(file_2_read)
df_tsne.head()
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# The t-SNE mapping was built from a SAMPLE of the base data, so recover the
# base features by looking up each row's 'sample_index' in df_cluster's index.
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
df_join = df_tsne.join(df_cluster, on='sample_index')
df_join.head()
col_names = list(df_join.columns)
col_names
df_join.describe().T
# set required variables for model comparison:
# one row per fitted model — name, cluster count, fit statistics, wall time
kmeans_tbl = pd.DataFrame(columns = [
    'model_name',
    'n_clusters',
    'inertia',
    'silhouette',
    'process_time'])
# running row-index counter for the comparison table
# (dead code removed: the original assigned `i_index = []` and then
#  immediately overwrote it with 0)
i_index = 0
# preparation for cross validation and model comparison, each classifier is appended once model is fit
models = []
# ... k-means on the t-sne vectors: the two embedding axes are the features
X_tsne = pd.DataFrame(columns=['t1', 't2'])
X_tsne['t1'] = df_join['x-tsne']
X_tsne['t2'] = df_join['y-tsne']
# Sweep the number of clusters for k-means on the 2-D t-SNE features.
# (The loop variable is called n_lda by this file's convention, but the
#  features here are the t-SNE axes, not LDA topics.)
for n_lda in range(2, 21):
    tic = time.perf_counter()  # time.clock() was removed in Python 3.8
    print("n_lda = ", n_lda)
    cls_lda = KMeans(n_clusters = n_lda,
                     init = 'k-means++',
                     random_state = 1)
    cls_lda.fit(X_tsne)
    kmeans_labels = cls_lda.labels_          # cluster id per row
    kmeans_centers = cls_lda.cluster_centers_
    kmeans_inertia = cls_lda.inertia_
    print("inertia = ", kmeans_inertia)
    # silhouette on a 10k subsample to keep the sweep fast
    kmeans_silhouette = metrics.silhouette_score(X_tsne,
                                                 kmeans_labels,
                                                 metric = 'euclidean',
                                                 sample_size = 10000)
    print("silhouette = ", kmeans_silhouette)
    toc = time.perf_counter()
    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    # ... - save statistics for model comparison
    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    exe_time = '{0:.4f}'.format(toc - tic)
    raw_data = {
        # label fixed: these are t-SNE features, not LDA features
        'model_name' : 'KMeans - t-SNE features',
        'n_clusters' : n_lda,
        'inertia': kmeans_inertia,
        'silhouette': kmeans_silhouette,
        'process_time' : exe_time
    }
    # bug fix: the original never incremented i_index, so every row was
    # inserted with index 1 (duplicate index labels)
    i_index += 1
    df_tbl = pd.DataFrame(raw_data,
                          columns = ['model_name', 'n_clusters', 'inertia', 'silhouette', 'process_time'],
                          index = [i_index])
    # DataFrame.append() was removed in pandas 2.0 -> use pd.concat
    kmeans_tbl = pd.concat([kmeans_tbl, df_tbl])
    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    # ... - plot the clusters found at this k
    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    _ = plt.figure(figsize=(12, 8))
    _ = plt.subplot(111, facecolor = 'darkgrey')
    X_tsne_values = X_tsne.values
    _ = plt.scatter(X_tsne_values[:, 0], X_tsne_values[:, 1],
                    c = kmeans_labels,
                    cmap = plt.cm.Paired,
                    s = 50,
                    linewidths = 0,
                    alpha = 0.20)
    _ = plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 1],
                    c = range(n_lda),
                    cmap = plt.cm.Paired,
                    s = 400,
                    linewidths = 1.0,
                    marker = '^',
                    edgecolors = 'black',
                    alpha = 0.90)
    # annotate each centroid with its cluster id
    for ii in range(n_lda):
        _ = plt.text(kmeans_centers[ii, 0], kmeans_centers[ii, 1], ii, fontsize = 40)
        print(ii, kmeans_centers[ii, 0], kmeans_centers[ii, 1], ii)
    _ = plt.xlabel('t-SNE axis 1')
    _ = plt.ylabel('t-SNE axis 2')
    _ = plt.grid(True)
    plt.show()
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... - plot metrics across models for comparison (3 panels)
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
plt.figure(figsize=(16, 6))

# silhouette vs number of clusters
plt.subplot(131)
plt.scatter(kmeans_tbl['n_clusters'], kmeans_tbl['silhouette'],
            s=40, linewidths=1.0, marker='^', edgecolors='black', alpha=0.90)
plt.plot(kmeans_tbl['n_clusters'], kmeans_tbl['silhouette'])
plt.xlabel('n_clusters')
plt.ylabel('silhouette')
plt.grid()

# inertia vs number of clusters (elbow curve)
plt.subplot(132)
plt.scatter(kmeans_tbl['n_clusters'], kmeans_tbl['inertia'],
            s=40, linewidths=1.0, marker='^', edgecolors='black', alpha=0.90)
plt.plot(kmeans_tbl['n_clusters'], kmeans_tbl['inertia'])
plt.xlabel('n_clusters')
plt.ylabel('inertia')
plt.grid()

# wall-clock fit time (scatter only — process_time holds formatted strings)
plt.subplot(133)
plt.scatter(kmeans_tbl['n_clusters'], kmeans_tbl['process_time'],
            s=40, linewidths=1.0, marker='^', edgecolors='black', alpha=0.90)
plt.xlabel('n_clusters')
plt.ylabel('process_time')
plt.grid()

plt.show()
# Refit at two candidate cluster counts and overlay the centroids on the
# points colored by the binary 'popular' target.
# NOTE(review): n_clusters_chosen is assigned but the loop iterates a
# hard-coded [4, 15] — confirm which was intended.
n_clusters_chosen = 4
for n_lda in [4, 15]:
    tic = time.perf_counter()  # time.clock() was removed in Python 3.8
    print("n_lda = ", n_lda)
    cls_lda = KMeans(n_clusters = n_lda,
                     init = 'k-means++',
                     random_state = 1)
    cls_lda.fit(X_tsne)
    kmeans_labels = cls_lda.labels_          # cluster id per row
    kmeans_centers = cls_lda.cluster_centers_
    kmeans_inertia = cls_lda.inertia_
    print("inertia = ", kmeans_inertia)
    # silhouette on a 10k subsample, same settings as the sweep above
    kmeans_silhouette = metrics.silhouette_score(X_tsne,
                                                 kmeans_labels,
                                                 metric = 'euclidean',
                                                 sample_size = 10000)
    print("silhouette = ", kmeans_silhouette)
    toc = time.perf_counter()
    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    # ... - points colored by 'popular', centroids marked on top
    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    plt.figure(figsize=(12, 12))
    ax = plt.gca()
    X_tsne_values = X_tsne.values
    plt.scatter(X_tsne_values[:, 0], X_tsne_values[:, 1],
                c = df_join['popular'],
                cmap = plt.cm.Spectral,
                s = 50,
                linewidths = 0,
                alpha = 0.20)
    plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 1],
                c = range(n_lda),
                cmap = plt.cm.tab20,
                s = 400,
                linewidths = 1.0,
                marker = '^',
                edgecolors = 'black',
                alpha = 0.90)
    # annotate each centroid with its cluster id
    for ii in range(n_lda):
        plt.text(kmeans_centers[ii, 0], kmeans_centers[ii, 1], ii, fontsize = 40)
        print(ii, kmeans_centers[ii, 0], kmeans_centers[ii, 1], ii)
    print(kmeans_labels)
    print(kmeans_centers)
    plt.xlabel('t1')
    plt.ylabel('t2')
    plt.grid()
    plt.show()
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... - attach the final k-means labels to the joined data for profiling
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
len(kmeans_labels)
X_all_together = copy.deepcopy(df_join)
len(X_all_together)
# one cluster label per joined row (the two lengths echoed above should match)
X_all_together['kmeans_labels'] = kmeans_labels
X_all_together.describe().T
# boxplot across clusters for each feature ...
# (removed: a duplicate mid-file `import seaborn as sns` — seaborn is already
#  imported at the top of the file — and a repeated col_names assignment)
col_names = X_all_together.columns.values.tolist()
# Per-feature cluster profiling: for every column in the joined frame draw a
# 3-panel figure —
#   (1) t-SNE scatter colored by the feature's value,
#   (2) per-cluster boxplot of the feature with cluster means annotated,
#   (3) t-SNE scatter colored by cluster label with centroids overlaid —
# then save the figure to a png named after the column.
for col in col_names :
    _ = plt.figure(figsize=(24, 8));
    # ... feature distribution color map
    _ = plt.subplot(131, facecolor = 'darkgrey');
    _ = plt.scatter(X_all_together['x-tsne'], X_all_together['y-tsne'],
                    c = X_all_together[col],
                    cmap = plt.cm.Spectral,
                    s = 50,
                    linewidths = 0,
                    alpha = 0.30)
    _ = plt.title(col)
    # ... feature boxplots, one box per k-means cluster
    _ = plt.subplot(132, facecolor = 'darkgrey');
    ax = sns.boxplot(x = "kmeans_labels", y = col, data = X_all_together);
    # per-cluster mean of this feature, rounded for the annotation text
    average_values = X_all_together.groupby(['kmeans_labels'])[col].mean().values
    average_labels = [str(np.round(s, 2)) for s in average_values]
    pos = range(len(average_values))
    # write each cluster's mean value at the mean's height on its box
    for tick, label in zip(pos, ax.get_xticklabels()):
        _ = ax.text(pos[tick], average_values[tick], average_labels[tick],
                    horizontalalignment = 'center', size = 'small', color = 'w', weight = 'semibold')
    # ... cluster color map with centroids from the last fit above
    _ = plt.subplot(133, facecolor = 'darkgrey');
    _ = plt.scatter(X_all_together['x-tsne'], X_all_together['y-tsne'],
                    c = kmeans_labels,
                    cmap = plt.cm.tab20,
                    s = 50,
                    linewidths = 0,
                    alpha = 0.30)
    _ = plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 1],
                    c = range(n_lda),
                    cmap = plt.cm.tab20b,
                    s = 200,
                    linewidths = 1.0,
                    marker = '^',
                    edgecolors = 'black',
                    alpha = 0.50);
    # annotate each centroid with its cluster id
    for ii in range(n_lda) :
        _ = plt.text(kmeans_centers[ii, 0], kmeans_centers[ii, 1], ii, fontsize = 20)
    _ = plt.xlabel('t-SNE axis 1')
    _ = plt.ylabel('t-SNE axis 2')
    _ = plt.title('t-SNE 2-D mapping')
    # NOTE(review): 'preplx' in the filename looks like a typo for 'perplx' —
    # confirm with downstream consumers before renaming
    plt.savefig("cluster_kmeans_3way_preplx_100_%s.png" %col)
    _ = plt.show();